sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.1  magrittr_1.5    tools_3.5.1     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.0      stringi_1.2.4   rmarkdown_1.11 
##  [9] knitr_1.20      stringr_1.3.1   digest_0.6.18   evaluate_0.12
# # https://gist.github.com/smithdanielle/9913897
# check.packages <- function(pkg){
#     new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
#     if (length(new.pkg)) 
#         install.packages(new.pkg, dependencies = TRUE)
#     #sapply(pkg, require, character.only = TRUE)
#     sapply(pkg, library, character.only = TRUE) # see comment below in GitHub repo
# }
# 
# # Usage example
# packages<-c("ggplot2", "dplyr", "caret", "caTools", "neuralnet", "tictoc", "randomForest", "DT", "e1071", "xgboost")
# check.packages(packages)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DT)

User Inputs

# Copy the report parameters into plain variables used by the rest of the
# script: the response column name, the log-scale switch and the EDA switch.
output.var <- params[["output.var"]]
log.pred <- params[["log.pred"]]
eda <- params[["eda"]]

message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 3
##  $ output.var: chr "y3"
##  $ log.pred  : logi FALSE
##  $ eda       : logi TRUE
# Setup Labels
# label.names          = name of the response column on the scale being predicted
# alt.scale.label.name = Alternate Scale variable name
#   - if predicting on log, then alt.scale is normal scale
#   - if predicting on normal scale, then alt.scale is log scale
#
# Validate the switch up front: with two independent `== TRUE` / `== FALSE`
# tests a value matching neither would silently leave both names undefined.
if (length(log.pred) != 1L || is.na(as.logical(log.pred))) {
  stop("`log.pred` must be a single TRUE/FALSE value", call. = FALSE)
}
if (as.logical(log.pred)) {
  label.names <- paste0("log.", output.var)
  alt.scale.label.name <- output.var
} else {
  label.names <- output.var
  alt.scale.label.name <- paste0("log.", output.var)
}

Prepare Data

Read and Clean Features

# Load the feature table from disk and print its structure.
features <- read.csv(file = "../../Data/features.csv")
str(features)
## 'data.frame':    10000 obs. of  241 variables:
##  $ JobName: Factor w/ 10000 levels "Job_00001","Job_00002",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ x1     : num  2.073 2.268 1.742 0.787 2.334 ...
##  $ x2     : num  4.92 4.96 2.06 2.61 4.3 ...
##  $ x3     : num  20 19.1 13.4 17.2 14.6 ...
##  $ x4     : num  3.52 19.76 38.83 64.4 52.54 ...
##  $ x5     : num  7.86 6.93 6.27 5.38 6.79 ...
##  $ x6     : num  1.607 1.362 2.053 0.907 2.461 ...
##  $ x7     : num  2.98 2.39 2.04 2.4 2.89 ...
##  $ x8     : num  8.54 6.56 10.28 13.49 9.36 ...
##  $ x9     : num  1.103 0.589 4.834 3.34 1.246 ...
##  $ x10    : num  4.61 1.03 4.39 4.51 1.73 ...
##  $ x11    : num  1.05e-07 1.03e-07 1.06e-07 9.47e-08 1.01e-07 1.07e-07 9.89e-08 9.30e-08 9.70e-08 9.47e-08 ...
##  $ x12    : num  8 7.49 6.35 9.55 9.6 ...
##  $ x13    : num  13.22 22.56 15.05 17.17 5.79 ...
##  $ x14    : num  4.38 2.06 3.26 3.09 3.94 ...
##  $ x15    : num  0.237 0.564 2.06 1.881 1.582 ...
##  $ x16    : num  6.08 6.9 8.42 11.19 7.1 ...
##  $ x17    : num  3.99 4.15 4.49 2.13 3.56 ...
##  $ x18    : num  4.77 6.85 3.49 5.59 7.77 ...
##  $ x19    : num  2.7 9.62 4.72 5.11 1.36 ...
##  $ x20    : num  1.04 1.92 1.56 1.49 1.24 ...
##  $ x21    : num  42.4 26.6 20.1 32.6 44.6 ...
##  $ x22    : num  1.36 4.05 3.08 1.36 1.94 ...
##  $ x23    : num  2.7 2.38 4.49 3.4 2.25 ...
##  $ stat1  : num  2.38 -1.407 -0.767 0.437 2.449 ...
##  $ stat2  : num  0.188 1.814 -0.123 -1.936 -0.617 ...
##  $ stat3  : num  -1.228 1.62 1.142 0.903 -2.552 ...
##  $ stat4  : num  -0.6 2.64 2.98 -1.6 -2.15 ...
##  $ stat5  : num  0.1489 1.9208 2.4226 -0.0018 -2.3111 ...
##  $ stat6  : num  -0.662 1.741 -0.417 -0.695 -1.017 ...
##  $ stat7  : num  -2.485 -1.96 2.221 -0.369 2.727 ...
##  $ stat8  : num  0.365 -2.019 -2.674 -0.971 1.542 ...
##  $ stat9  : num  2.536 -1.373 0.484 1.796 -1.316 ...
##  $ stat10 : num  2.9207 -0.3164 2.7338 0.7477 -0.0977 ...
##  $ stat11 : num  -2.323 -0.855 -2.182 1.398 0.957 ...
##  $ stat12 : num  -2.48 1.12 2.87 1.86 2.57 ...
##  $ stat13 : num  -0.634 0.723 -2.976 -1.038 0.318 ...
##  $ stat14 : num  -0.365 0.212 2.987 2.334 1.031 ...
##  $ stat15 : num  -0.532 -0.145 1.954 2.306 0.164 ...
##  $ stat16 : num  0.603 -2.036 -1.886 -2.895 -0.661 ...
##  $ stat17 : num  -1.0452 0.0951 0.4029 2.9745 -0.9847 ...
##  $ stat18 : num  2.354 0.473 1.466 2.39 0.69 ...
##  $ stat19 : num  2.4 1.89 -1.5 2.31 1.59 ...
##  $ stat20 : num  0.263 2.789 2.916 -1.189 -2.12 ...
##  $ stat21 : num  -0.979 -1.392 -2.389 -2.198 1.796 ...
##  $ stat22 : num  1.787 -1.72 2.816 1.367 -0.936 ...
##  $ stat23 : num  -2.37 -2.33 -2.54 -1.97 2.05 ...
##  $ stat24 : num  2.858 1.558 0.142 -1.408 -2.208 ...
##  $ stat25 : num  -0.472 -1.957 0.357 2.51 -1.928 ...
##  $ stat26 : num  -2.82 1.55 -1.05 1.68 -2.12 ...
##  $ stat27 : num  -0.952 -0.508 -2.154 -0.255 1.818 ...
##  $ stat28 : num  2.8889 -1.5872 0.0307 -2.9038 -1.4217 ...
##  $ stat29 : num  0.799 1.976 -0.446 1.057 0.885 ...
##  $ stat30 : num  -2.006 -0.387 1.028 2.559 2.277 ...
##  $ stat31 : num  -0.246 1.357 1.4 -2.983 2.65 ...
##  $ stat32 : num  0.648 2.649 -1.018 -1.13 2.305 ...
##  $ stat33 : num  -2.8746 2.2846 1.4111 0.0547 -2.3915 ...
##  $ stat34 : num  -0.36 1.86 -2.42 -1.56 -1.83 ...
##  $ stat35 : num  2.429 1.371 -0.981 1.097 -1.097 ...
##  $ stat36 : num  -0.542 -1.371 2.057 -2.282 1.487 ...
##  $ stat37 : num  -2.678 1.39 0.885 1.885 -2.374 ...
##  $ stat38 : num  -2.887 1.227 2.057 0.539 -0.374 ...
##  $ stat39 : num  -0.895 -0.893 1.122 2.733 1.427 ...
##  $ stat40 : num  1.175 1.054 1.853 -0.437 1.255 ...
##  $ stat41 : num  -1.047 2.538 1.148 -1.381 0.226 ...
##  $ stat42 : num  -1.391 1.648 0.229 -2.79 1.954 ...
##  $ stat43 : num  2.5411 0.4413 0.0889 2.383 2.6643 ...
##  $ stat44 : num  -1.432 -2.505 2.304 0.169 0.803 ...
##  $ stat45 : num  0.63 1.273 -0.774 -2.159 -1.552 ...
##  $ stat46 : num  -2.093 1.725 -0.073 1.608 1.618 ...
##  $ stat47 : num  -2.832 -0.58 0.792 -1.889 2.109 ...
##  $ stat48 : num  2.145 -1.369 1.571 0.568 -2.72 ...
##  $ stat49 : num  0.567 1.491 1.104 -0.702 2.196 ...
##  $ stat50 : num  0.154 1.247 -0.255 -0.397 -0.262 ...
##  $ stat51 : num  0.629 0.89 -2.166 0.158 1.211 ...
##  $ stat52 : num  2.22 -2.602 0.266 2.177 0.826 ...
##  $ stat53 : num  2.18 -2.11 1.23 2.54 -2.46 ...
##  $ stat54 : num  0.555 1.386 2.134 -2.139 2.163 ...
##  $ stat55 : num  -2.197 0.0878 1.6523 0.1286 0.6044 ...
##  $ stat56 : num  -0.288 2 -0.439 -1.991 2.545 ...
##  $ stat57 : num  1.323 0.801 -0.181 0.963 -1.498 ...
##  $ stat58 : num  -1.33 -0.27 2.11 1.65 2.61 ...
##  $ stat59 : num  1.2424 0.0638 0.9322 -0.2984 -1.1761 ...
##  $ stat60 : num  -2.58 0.947 2.46 0.727 -1.795 ...
##  $ stat61 : num  1.328 1.117 0.465 -2.313 -2.669 ...
##  $ stat62 : num  1.6856 0.0313 -1.7103 -1.477 0.1781 ...
##  $ stat63 : num  0.628 -2.194 -0.516 2.591 2.896 ...
##  $ stat64 : num  -1.68 0.338 1.828 -1.513 2.941 ...
##  $ stat65 : num  -2.949 -1.117 -0.223 -0.352 -2.165 ...
##  $ stat66 : num  -0.333 -1.573 -0.45 -2.072 1.2 ...
##  $ stat67 : num  1.575 -2.923 0.793 0.944 2.827 ...
##  $ stat68 : num  -2.298 0.266 -1.245 2.921 0.746 ...
##  $ stat69 : num  1.55 -1.96 -2.23 0.51 1.68 ...
##  $ stat70 : num  -1.35 2.51 2.31 -2.44 -1.28 ...
##  $ stat71 : num  1.026 0.353 -2.18 -2.405 1.354 ...
##  $ stat72 : num  2.107 1.692 -2.265 2.088 -0.809 ...
##  $ stat73 : num  2.663 -1.217 0.142 -0.863 -0.512 ...
##  $ stat74 : num  -2.892 -1.727 0.989 0.401 -2.17 ...
##  $ stat75 : num  -0.0213 2.2118 1.9559 -1.1699 1.0734 ...
##   [list output truncated]

Checking correlations to evaluate removal of redundant features

# Pairwise correlations between all numeric feature columns, rounded to two
# decimals for display. vapply() guarantees a logical column mask.
numeric.cols <- vapply(features, is.numeric, logical(1))
corr.matrix <- round(cor(features[numeric.cols]), 2)

# filter out only highly correlated variables
threshold <- 0.6
corr.matrix.tmp <- corr.matrix
diag(corr.matrix.tmp) <- 0  # a variable's correlation with itself is not of interest
# na.rm = TRUE: cor() can return NA (e.g. for a zero-variance column or
# missing values); without it a row containing NA and no match gives NA,
# which would corrupt the logical row/column index below.
high.corr <- apply(abs(corr.matrix.tmp) >= threshold, 1, any, na.rm = TRUE)
# drop = FALSE keeps the result a matrix whatever the number of selected rows.
high.corr.matrix <- corr.matrix.tmp[high.corr, high.corr, drop = FALSE]

DT::datatable(corr.matrix)
DT::datatable(high.corr.matrix)

Clean Column Names

Feature Names

# Collect the names of all feature columns, leaving out the columns in `drops`.
drops <- c("JobName")
feature.names <- colnames(features)
feature.names <- feature.names[is.na(match(feature.names, drops))]
str(feature.names)
##  chr [1:240] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10" "x11" ...

Read and Clean Labels

# Load the label table, then keep only the job identifier and the response
# column selected through `output.var`.
labels <- read.csv(file = "../../Data/labels.csv")
#str(labels)
labels <- labels[, c("JobName", output.var)]
summary(labels)
##       JobName           y3        
##  Job_00001:   1   Min.   : 95.91  
##  Job_00002:   1   1st Qu.:118.21  
##  Job_00003:   1   Median :123.99  
##  Job_00004:   1   Mean   :125.36  
##  Job_00005:   1   3rd Qu.:131.06  
##  Job_00006:   1   Max.   :193.73  
##  (Other)  :9994   NA's   :2497

Clean Column Names

Merge Datasets

# Join features and labels on the job identifier, then remove the identifier
# column so only features and the response remain.
data <- merge(x = features, y = labels, by = "JobName")
drops <- c("JobName")
data <- data[, is.na(match(colnames(data), drops))]
str(data)
## 'data.frame':    10000 obs. of  241 variables:
##  $ x1     : num  2.073 2.268 1.742 0.787 2.334 ...
##  $ x2     : num  4.92 4.96 2.06 2.61 4.3 ...
##  $ x3     : num  20 19.1 13.4 17.2 14.6 ...
##  $ x4     : num  3.52 19.76 38.83 64.4 52.54 ...
##  $ x5     : num  7.86 6.93 6.27 5.38 6.79 ...
##  $ x6     : num  1.607 1.362 2.053 0.907 2.461 ...
##  $ x7     : num  2.98 2.39 2.04 2.4 2.89 ...
##  $ x8     : num  8.54 6.56 10.28 13.49 9.36 ...
##  $ x9     : num  1.103 0.589 4.834 3.34 1.246 ...
##  $ x10    : num  4.61 1.03 4.39 4.51 1.73 ...
##  $ x11    : num  1.05e-07 1.03e-07 1.06e-07 9.47e-08 1.01e-07 1.07e-07 9.89e-08 9.30e-08 9.70e-08 9.47e-08 ...
##  $ x12    : num  8 7.49 6.35 9.55 9.6 ...
##  $ x13    : num  13.22 22.56 15.05 17.17 5.79 ...
##  $ x14    : num  4.38 2.06 3.26 3.09 3.94 ...
##  $ x15    : num  0.237 0.564 2.06 1.881 1.582 ...
##  $ x16    : num  6.08 6.9 8.42 11.19 7.1 ...
##  $ x17    : num  3.99 4.15 4.49 2.13 3.56 ...
##  $ x18    : num  4.77 6.85 3.49 5.59 7.77 ...
##  $ x19    : num  2.7 9.62 4.72 5.11 1.36 ...
##  $ x20    : num  1.04 1.92 1.56 1.49 1.24 ...
##  $ x21    : num  42.4 26.6 20.1 32.6 44.6 ...
##  $ x22    : num  1.36 4.05 3.08 1.36 1.94 ...
##  $ x23    : num  2.7 2.38 4.49 3.4 2.25 ...
##  $ stat1  : num  2.38 -1.407 -0.767 0.437 2.449 ...
##  $ stat2  : num  0.188 1.814 -0.123 -1.936 -0.617 ...
##  $ stat3  : num  -1.228 1.62 1.142 0.903 -2.552 ...
##  $ stat4  : num  -0.6 2.64 2.98 -1.6 -2.15 ...
##  $ stat5  : num  0.1489 1.9208 2.4226 -0.0018 -2.3111 ...
##  $ stat6  : num  -0.662 1.741 -0.417 -0.695 -1.017 ...
##  $ stat7  : num  -2.485 -1.96 2.221 -0.369 2.727 ...
##  $ stat8  : num  0.365 -2.019 -2.674 -0.971 1.542 ...
##  $ stat9  : num  2.536 -1.373 0.484 1.796 -1.316 ...
##  $ stat10 : num  2.9207 -0.3164 2.7338 0.7477 -0.0977 ...
##  $ stat11 : num  -2.323 -0.855 -2.182 1.398 0.957 ...
##  $ stat12 : num  -2.48 1.12 2.87 1.86 2.57 ...
##  $ stat13 : num  -0.634 0.723 -2.976 -1.038 0.318 ...
##  $ stat14 : num  -0.365 0.212 2.987 2.334 1.031 ...
##  $ stat15 : num  -0.532 -0.145 1.954 2.306 0.164 ...
##  $ stat16 : num  0.603 -2.036 -1.886 -2.895 -0.661 ...
##  $ stat17 : num  -1.0452 0.0951 0.4029 2.9745 -0.9847 ...
##  $ stat18 : num  2.354 0.473 1.466 2.39 0.69 ...
##  $ stat19 : num  2.4 1.89 -1.5 2.31 1.59 ...
##  $ stat20 : num  0.263 2.789 2.916 -1.189 -2.12 ...
##  $ stat21 : num  -0.979 -1.392 -2.389 -2.198 1.796 ...
##  $ stat22 : num  1.787 -1.72 2.816 1.367 -0.936 ...
##  $ stat23 : num  -2.37 -2.33 -2.54 -1.97 2.05 ...
##  $ stat24 : num  2.858 1.558 0.142 -1.408 -2.208 ...
##  $ stat25 : num  -0.472 -1.957 0.357 2.51 -1.928 ...
##  $ stat26 : num  -2.82 1.55 -1.05 1.68 -2.12 ...
##  $ stat27 : num  -0.952 -0.508 -2.154 -0.255 1.818 ...
##  $ stat28 : num  2.8889 -1.5872 0.0307 -2.9038 -1.4217 ...
##  $ stat29 : num  0.799 1.976 -0.446 1.057 0.885 ...
##  $ stat30 : num  -2.006 -0.387 1.028 2.559 2.277 ...
##  $ stat31 : num  -0.246 1.357 1.4 -2.983 2.65 ...
##  $ stat32 : num  0.648 2.649 -1.018 -1.13 2.305 ...
##  $ stat33 : num  -2.8746 2.2846 1.4111 0.0547 -2.3915 ...
##  $ stat34 : num  -0.36 1.86 -2.42 -1.56 -1.83 ...
##  $ stat35 : num  2.429 1.371 -0.981 1.097 -1.097 ...
##  $ stat36 : num  -0.542 -1.371 2.057 -2.282 1.487 ...
##  $ stat37 : num  -2.678 1.39 0.885 1.885 -2.374 ...
##  $ stat38 : num  -2.887 1.227 2.057 0.539 -0.374 ...
##  $ stat39 : num  -0.895 -0.893 1.122 2.733 1.427 ...
##  $ stat40 : num  1.175 1.054 1.853 -0.437 1.255 ...
##  $ stat41 : num  -1.047 2.538 1.148 -1.381 0.226 ...
##  $ stat42 : num  -1.391 1.648 0.229 -2.79 1.954 ...
##  $ stat43 : num  2.5411 0.4413 0.0889 2.383 2.6643 ...
##  $ stat44 : num  -1.432 -2.505 2.304 0.169 0.803 ...
##  $ stat45 : num  0.63 1.273 -0.774 -2.159 -1.552 ...
##  $ stat46 : num  -2.093 1.725 -0.073 1.608 1.618 ...
##  $ stat47 : num  -2.832 -0.58 0.792 -1.889 2.109 ...
##  $ stat48 : num  2.145 -1.369 1.571 0.568 -2.72 ...
##  $ stat49 : num  0.567 1.491 1.104 -0.702 2.196 ...
##  $ stat50 : num  0.154 1.247 -0.255 -0.397 -0.262 ...
##  $ stat51 : num  0.629 0.89 -2.166 0.158 1.211 ...
##  $ stat52 : num  2.22 -2.602 0.266 2.177 0.826 ...
##  $ stat53 : num  2.18 -2.11 1.23 2.54 -2.46 ...
##  $ stat54 : num  0.555 1.386 2.134 -2.139 2.163 ...
##  $ stat55 : num  -2.197 0.0878 1.6523 0.1286 0.6044 ...
##  $ stat56 : num  -0.288 2 -0.439 -1.991 2.545 ...
##  $ stat57 : num  1.323 0.801 -0.181 0.963 -1.498 ...
##  $ stat58 : num  -1.33 -0.27 2.11 1.65 2.61 ...
##  $ stat59 : num  1.2424 0.0638 0.9322 -0.2984 -1.1761 ...
##  $ stat60 : num  -2.58 0.947 2.46 0.727 -1.795 ...
##  $ stat61 : num  1.328 1.117 0.465 -2.313 -2.669 ...
##  $ stat62 : num  1.6856 0.0313 -1.7103 -1.477 0.1781 ...
##  $ stat63 : num  0.628 -2.194 -0.516 2.591 2.896 ...
##  $ stat64 : num  -1.68 0.338 1.828 -1.513 2.941 ...
##  $ stat65 : num  -2.949 -1.117 -0.223 -0.352 -2.165 ...
##  $ stat66 : num  -0.333 -1.573 -0.45 -2.072 1.2 ...
##  $ stat67 : num  1.575 -2.923 0.793 0.944 2.827 ...
##  $ stat68 : num  -2.298 0.266 -1.245 2.921 0.746 ...
##  $ stat69 : num  1.55 -1.96 -2.23 0.51 1.68 ...
##  $ stat70 : num  -1.35 2.51 2.31 -2.44 -1.28 ...
##  $ stat71 : num  1.026 0.353 -2.18 -2.405 1.354 ...
##  $ stat72 : num  2.107 1.692 -2.265 2.088 -0.809 ...
##  $ stat73 : num  2.663 -1.217 0.142 -0.863 -0.512 ...
##  $ stat74 : num  -2.892 -1.727 0.989 0.401 -2.17 ...
##  $ stat75 : num  -0.0213 2.2118 1.9559 -1.1699 1.0734 ...
##  $ stat76 : num  -2.506 1.933 0.295 -1.239 2.67 ...
##   [list output truncated]

Transformations

#str(data)
# When predicting on the log scale, add the base-10 log of the original
# response as a new column; the original-scale column is kept alongside it.
if (log.pred == TRUE) {
  data[[label.names]] <- log(data[[alt.scale.label.name]], base = 10)
  #drops = c(alt.scale.label.name)
  #data = data[!(names(data) %in% drops)]
}
#str(data)

Exploratory Data Analysis

Scatterplots

# Draw a histogram of `x` into the current plot region, scaled so the tallest
# bar has height 1 inside a temporary y-range of [0, 1.5].
# NOTE(review): has the shape of a pairs() diagonal-panel function, but no
# call site is visible in this part of the file.
#
# Args:
#   x:   numeric vector to bin.
#   ...: further arguments forwarded to rect().
# Returns the value of rect(); called for its drawing side effect.
panel.hist <- function(x, ...)
{
    usr <- par("usr")
    # Restore the caller's coordinate system on exit. The parameter has to be
    # named: a bare par(usr) sets nothing and only raises a warning.
    on.exit(par(usr = usr), add = TRUE)
    par(usr = c(usr[1:2], 0, 1.5))
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks
    nB <- length(breaks)
    y <- h$counts / max(h$counts)  # relative bar heights in [0, 1]
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
# Optional EDA step: histogram of the response column (label.names), using
# only the rows of `data` that have no missing value in any column.
if (eda == TRUE){
  # hist() builds its default title/axis label from this call expression
  hist(data[complete.cases(data),label.names])
  #hist(data[complete.cases(data),alt.scale.label.name])
}

# https://stackoverflow.com/questions/24648729/plot-one-numeric-variable-against-n-numeric-variables-in-n-plots
# Draw one scatterplot per predictor: each column named in `xvars` on the
# x-axis against the single column `yvar` on the y-axis.
#
# Args:
#   data:  data frame holding the columns to plot.
#   xvars: character vector of x-axis column names; NULL (default) selects
#          every column of `data` except `yvar`.
#   yvar:  name of the column drawn on the y-axis of every chart.
# Returns NULL invisibly; called for its plotting side effect.
ind.pairs.plot <- function(data, xvars=NULL, yvar)
{
    if (is.null(xvars)) {
        xvars <- names(data)[names(data) != yvar]
    }

    # Charts are drawn one per plot; the grid layout via par(mfrow) and the
    # 25-variable cap from the original snippet remain disabled.

    # seq_along() rather than 1:length(xvars): with no predictors the loop
    # body must not run at all, whereas 1:0 would iterate over c(1, 0) and
    # fail on a non-existent column.
    for (i in seq_along(xvars)) {
        plot(data[, xvars[i]], data[, yvar], xlab = xvars[i], ylab = yvar)
    }
    invisible(NULL)
}

ind.pairs.plot(data, feature.names, label.names)

Feature Engineering

# x18 may need transformations
# Compare the response against x18 as-is, square-rooted and squared.
plot(x = data[["x18"]], y = data[[label.names]],
     xlab = "x18", ylab = label.names,
     main = "Original Scatter Plot vs. x18")

plot(x = sqrt(data[["x18"]]), y = data[[label.names]],
     xlab = "sqrt(x18)", ylab = label.names,
     main = "Original Scatter Plot vs. sqrt(x18)")

plot(x = data[["x18"]]^2, y = data[[label.names]],
     xlab = "x18**2", ylab = label.names,
     main = "Original Scatter Plot vs. square(x18)")